# # -*- coding: utf-8 -*-
# import copy
# import math
# import random
#
# import scrapy
# from scrapy import Request
# from scrapy.exceptions import IgnoreRequest
# from datetime import datetime
# from zc_core.model.items import Box
# from zc_core.util.batch_gen import time_to_batch_no
# from zc_core.util.http_util import retry_request
# from hsysmall.rules import parse_catalog, parse_supplier, parse_sku_list, parse_total_page, parse_query_info
# from zc_core.spiders.base import BaseSpider
#
#
# class SkuSpider(BaseSpider):
#     name = 'sku'
#     # Frequently used URLs
#     index_url = 'https://www.hsysmall.com/index.html'
#     sku_list_url = 'https://www.hsysmall.com/list/{}.html'
#
#     start_urls = [index_url]
#
#     def __init__(self, batchNo=None, *args, **kwargs):
#         super(SkuSpider, self).__init__(batchNo=batchNo, *args, **kwargs)
#         # Pagination limits
#         self.max_page_limit = 200
#         self.page_size = 25
#
#     def start_requests(self):
#         yield Request(
#             url=self.index_url,
#             headers={
#                 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
#                 'accept-encoding': 'gzip, deflate, br',
#                 'accept-language': 'zh-CN,zh;q=0.9',
#                 'cache-control': 'max-age=0',
#                 'content-type': 'application/x-www-form-urlencoded',
#                 'origin': 'https://www.hsysmall.com',
#                 'referer': 'https://www.hsysmall.com',
#                 'upgrade-insecure-requests': '1',
#                 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3766.400 QQBrowser/10.6.4163.400',
#             },
#             meta={
#                 'reqType': 'catalog',
#                 'batchNo': self.batch_no,
#             },
#             callback=self.parse_catalog,
#             errback=self.error_back,
#         )
#
#     def parse_catalog(self, response):
#         # Categories
#         cats = parse_catalog(response)
#         if cats:
#             self.logger.info('品类: count[%s]' % len(cats))
#             yield Box('catalog', self.batch_no, cats)
#
#             random.shuffle(cats)
#             for cat in cats:
#                 if cat and cat.get('level') == 3:
#                     page = 1
#                     yield self._build_sku_req(
#                         cat=cat,
#                         page=page,
#                         callback=self.parse_sku_list
#                     )
#
#     def _build_sku_req(self, cat, page, callback, min_price=-1, max_price=-1):
#         form_data = {
#             'pageNo': str(page),
#             'cates': str(cat.get('catalogId')),
#             'priceMin': '',
#             'priceMax': ''
#         }
#         if min_price > 0:
#             form_data['priceMin'] = str(min_price)
#         if max_price > 0:
#             form_data['priceMax'] = str(max_price)
#         return scrapy.FormRequest(
#             url=self.sku_list_url.format(cat.get('catalogCode')),
#             method="POST",
#             meta={
#                 'reqType': 'sku',
#                 'batchNo': self.batch_no,
#                 'page': page,
#                 'cat': copy.copy(cat),
#                 'priceMin': min_price,
#                 'priceMax': max_price,
#                 'sort': '',
#             },
#             headers={
#                 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
#                 'accept-encoding': 'gzip, deflate, br',
#                 'accept-language': 'zh-CN,zh;q=0.9',
#                 'cache-control': 'max-age=0',
#                 'content-type': 'application/x-www-form-urlencoded',
#                 'origin': 'https://www.hsysmall.com',
#                 'referer': 'https://www.hsysmall.com',
#                 'upgrade-insecure-requests': '1',
#                 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3766.400 QQBrowser/10.6.4163.400',
#             },
#             formdata=form_data,
#             callback=callback,
#             errback=self.error_back,
#             priority=60,
#         )
#
#     def parse_sku_list(self, response):
#         meta = response.meta
#         cur_page = meta.get("page")
#         cat = meta.get('cat')
#         cat3_id = cat.get('catalogId')
#
#         # First page
#         sku_list = parse_sku_list(response)
#         if not sku_list:
#             self.logger.info('空页1: page=%s, cat=%s' % (cur_page, cat3_id))
#         else:
#             self.logger.info('清单1: cat=%s, page=%s, count=%s' % (cat3_id, cur_page, len(sku_list)))
#             yield Box('sku', self.batch_no, sku_list)
#
#             # Does the listing exceed 5000 items?
#             total_pages = parse_query_info(response)
#             if total_pages < self.max_page_limit:
#                 self.logger.info('页数1: cat=%s, ttp=%s' % (cat3_id, total_pages))
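#                 # Item count < 5000
#                 # Request the remaining pages directly
#                 # NOTE(review): the requests below do not forward priceMin/priceMax from meta,
#                 # so pages 2+ of a price-split query appear to be fetched without the price
#                 # filter - confirm whether min_price/max_price should be passed here.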
#                 for page in range(2, total_pages + 1):
#                     yield self._build_sku_req(
#                         page=page,
#                         cat=cat,
#                         callback=self.parse_more_sku
#                     )
#             else:
#                 # Item count >= 5000 (recursively add request parameters)
#                 cur_min_price = meta.get('priceMin', -1)
#                 cur_max_price = meta.get('priceMax', -1)
#                 if cur_min_price <= 0 or cur_max_price <= 0:
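#                     # Add price ladders
#                     # NOTE(review): the first ladder uses min=0, which is stored in meta as
#                     # priceMin=0 and satisfies the `cur_min_price <= 0` test above, so an
#                     # oversized 0-1000 range re-enters this branch instead of being halved - confirm.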
#                     self.logger.info('阶梯3: cat=%s, ttp=%s' % (cat3_id, total_pages))
#                     ladders = [
#                         {'min': 0, 'max': 1000},
#                         {'min': 1000, 'max': 5000},
#                         {'min': 5000, 'max': 100000000},
#                     ]
#                     for ladder in ladders:
#                         min_price = ladder.get('min')
#                         max_price = ladder.get('max')
#                         yield self._build_sku_req(
#                             cat=cat,
#                             page=1,
#                             min_price=min_price,
#                             max_price=max_price,
#                             callback=self.parse_sku_list
#                         )
#                 else:
#                     half_price = cur_min_price + math.ceil((cur_max_price - cur_min_price) / 2)
#                     # Split the price range in half
#                     yield self._build_sku_req(
#                         cat=cat,
#                         page=1,
#                         min_price=cur_min_price,
#                         max_price=half_price,
#                         callback=self.parse_sku_list
#                     )
#                     yield self._build_sku_req(
#                         cat=cat,
#                         page=1,
#                         min_price=half_price,
#                         max_price=cur_max_price,
#                         callback=self.parse_sku_list
#                     )
#                     self.logger.info('折半4: cat=%s, ttp=%s, half=%s' % (cat3_id, total_pages, half_price))
#
#     # Handle the SKU list
#     def parse_more_sku(self, response):
#         meta = response.meta
#         cat = meta.get('cat')
#         cat3_id = cat.get('catalogId')
#         cur_page = meta.get('page', 1)
#         cur_min_price = meta.get('priceMin', -1)
#         cur_max_price = meta.get('priceMax', -1)
#         # Items
#         sku_list = parse_sku_list(response)
#         if sku_list:
#             if cur_min_price < 0 and cur_max_price < 0:
#                 self.logger.info('清单21: cat=%s, page=%s, cnt=%s' % (cat3_id, cur_page, len(sku_list)))
#             else:
#                 self.logger.info('清单22: cat=%s, min=%s, max=%s, page=%s, cnt=%s' % (cat3_id, cur_min_price, cur_max_price, cur_page, len(sku_list)))
#             yield Box('sku', self.batch_no, sku_list)
#         else:
#             if cur_min_price < 0 and cur_max_price < 0:
#                 self.logger.info('空页21: cat=%s, page=%s' % (cat3_id, cur_page))
#             else:
#                 self.logger.info('空页22: cat=%s, min=%s, max=%s, page=%s' % (cat3_id, cur_min_price, cur_max_price, cur_page))
